{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests as req\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "unique_dois = set()\n",
    "\n",
    "condition = True\n",
    "start = 0\n",
    "total = 0\n",
    "per_page = 1000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Looking at files from page 0\n",
      "Looking at files from page 1000\n",
      "Looking at files from page 2000\n",
      "Looking at files from page 3000\n",
      "Looking at files from page 4000\n",
      "Looking at files from page 5000\n",
      "Looking at files from page 6000\n",
      "Looking at files from page 7000\n"
     ]
    }
   ],
   "source": [
    "while condition:\n",
    "    # check for native Harvard files with isHarvested%3Afalse\n",
    "    query = \"https://dataverse.harvard.edu/api/search?q=(fileContentType%3Atype%2Fx-r-syntax%20AND%20isHarvested%3Afalse)&type=file\"\n",
    "    query = query + \"&start=\" + str(start)\n",
    "    query = query + \"&per_page=\" + str(per_page)\n",
    "    print \"Looking at files from page \" + str(start)\n",
    "    \n",
    "    res = req.get(query)\n",
    "    res_dict = json.loads(res.text)\n",
    "    \n",
    "    total = res_dict['data']['total_count']\n",
    "    for item in res_dict['data']['items']:\n",
    "        unique_dois.add(item['dataset_persistent_id'])\n",
    "    \n",
    "    start = start + per_page\n",
    "    condition = start < total"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-- Summary --\n",
      "Total number of R files: 7857\n",
      "Total number of datasets that contain R files: 2002\n",
      "Number of R files per dataset: 3.92457542458\n"
     ]
    }
   ],
   "source": [
    "print(\"-- Summary --\")\n",
    "print(\"Total number of R files: \" + str(total))\n",
    "print(\"Total number of datasets that contain R files: \" + str(len(unique_dois)))\n",
    "print(\"Number of R files per dataset: \" + str(total/float(len(unique_dois))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('dataset_dois.txt', 'w') as f:\n",
    "    for doi in unique_dois:\n",
    "        f.write(\"%s\\n\" % doi)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "env"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
